Part 1: Reading in the Data

First, we need to read in the data and join the training and test data.

train_data <- read_csv("reddit_stress_data/dreaddit-train.csv")
## Rows: 2838 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test_data <- read_csv("reddit_stress_data/dreaddit-test.csv")
## Rows: 715 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# bind_rows() stacks the two data frames; add_row() is meant for appending
# individual name-value rows and does not concatenate data frames.
reddit_stress_data <- bind_rows(train_data, test_data)

Now we need to find the word distributions. We’ll start by unnesting the tokens, first on the full dataset and then separately on the test and training sets.

# Tokenize the `text` column into one row per word: strip punctuation,
# collapse standalone numbers into a '%d%' placeholder, and remove stop
# words. This was previously copy-pasted three times; factor it out once.
TokenizeWords <- function(df) {
  df %>%
    select(c("id", "text", "label", "subreddit")) %>%
    unnest_tokens(word, text) %>%
    mutate(word = gsub('[[:punct:]]+', '', word)) %>%
    mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
    # Explicit `by` silences the "Joining, by = ..." message.
    anti_join(stop_words, by = "word")
}
words_tokenized <- TokenizeWords(reddit_stress_data)
words_tokenized_test <- TokenizeWords(test_data)
words_tokenized_train <- TokenizeWords(train_data)

Part 2: Understanding the General Structure of the Data

Getting the Labels Distribution

# Number of posts per stress label (0/1), shown as a bar chart.
label_counts <- reddit_stress_data %>%
  group_by(label) %>%
  count()
label_counts %>% plot_ly(x = ~label, y = ~n, type = "bar")

Subreddit Distribution

# Number of posts per subreddit. plotly selects the trace with `type=`, not
# `kind=`; the original `kind = "bar"` was dropped with a warning while
# plotly guessed the trace type from the data.
subreddit_counts <- reddit_stress_data %>%
  group_by(subreddit) %>%
  count()
plot_ly(subreddit_counts, x = ~subreddit, y = ~n, type = "bar")

Labels By Subreddit

# `height`/`width` are not ggplot() arguments (they belong in knitr chunk
# options or ggsave()) and were silently ignored, so drop them.
ggplot(reddit_stress_data, aes(y = label)) + geom_boxplot(fill = "steelblue") + labs(title = "Labels by Subreddit") + facet_grid(label ~ subreddit)

Part 3: Visualizing the Words in Dataset and Reduction of Features

First, to get a better idea of what the overall word distribution is, let’s plot all of the word frequencies.

# Frequency of every token, most common first; the boxplot of the counts
# shows how heavily skewed the word distribution is.
my_top_word_counts <- words_tokenized %>%
  count(word, sort = TRUE)
ggplot(my_top_word_counts, aes(y = n)) + geom_boxplot(fill = "steelblue")

What are the top words that appear in the original dataset?

# Inspect the five most frequent tokens after stop-word removal.
head(my_top_word_counts, 5)
## # A tibble: 5 x 2
##   word      n
##   <chr> <int>
## 1 im     2235
## 2 %d%    2210
## 3 dont   1184
## 4 feel    967
## 5 time    967

Although some words appear very frequently, most of the words barely appear compared to the top words. Let’s see what happens if we eliminate words based on whether they appear less than or equal to 5 times.

# ReplaceRareWords() is defined elsewhere in the project; based on the later
# counts (a large "unk" bucket and "text_"-prefixed words) it appears to
# replace words occurring <= rare_defn times with an "unk" token --
# TODO confirm against its definition.
words_tokenized_rare_words_removed <- ReplaceRareWords(reddit_stress_data, rare_defn = 5)
words_tokenized_rare_words_removed %>%
  count(word) %>%
  arrange(desc(n)) %>%
  ggplot(aes(y = n)) + geom_boxplot(fill = "steelblue")

There seem to still be significant outliers in the dataset. I’m going to look at the one significant outlier and see what it is.

# Top five tokens after rare-word replacement; "unk" dominates, confirming
# it is the aggregate bucket for all replaced rare words.
words_tokenized_rare_words_removed %>%
  count(word) %>%
  arrange(desc(n)) %>%
  head(5)
## # A tibble: 5 x 2
##   word          n
##   <chr>     <int>
## 1 unk       14453
## 2 text_im    2235
## 3 text_%d%   2210
## 4 text_dont  1184
## 5 text_feel   967

Defining the cut off for rare words removal to be 5 reduces the number of words somewhat but the data is still heavily right skewed. Let’s try setting the cut off to 15.

# Repeat the rare-word replacement with a stricter cutoff of 15 occurrences
# and re-plot the count distribution.
words_tokenized_rare_words_removed <- ReplaceRareWords(reddit_stress_data, rare_defn = 15)
words_tokenized_rare_words_removed %>%
  count(word) %>%
  arrange(desc(n)) %>%
  ggplot(aes(y = n)) + geom_boxplot(fill = "steelblue")

We have removed a significant number of words but the data is still heavily right skewed. It looks like there is one significant outlier that is skewing the data.

# With cutoff 15 the "unk" bucket roughly doubles (29766 vs 14453),
# i.e. many more words fall below the rarity threshold.
words_tokenized_rare_words_removed %>%
  count(word) %>%
  arrange(desc(n)) %>%
  head(5)
## # A tibble: 5 x 2
##   word          n
##   <chr>     <int>
## 1 unk       29766
## 2 text_im    2235
## 3 text_%d%   2210
## 4 text_dont  1184
## 5 text_feel   967
# Vocabulary size of the un-filtered tokenized data, for comparison.
length(my_top_word_counts$word)
## [1] 12059

Part 4: Visualizing Top 20 Most Common Words Among the Data

Now let’s see the most common words among the data (overall).

# Return the `num` most frequent words in a tokenized data frame,
# ordered from most to least common.
GetTopNMostCommonWords <- function(df, num) {
  df %>%
    count(word, sort = TRUE) %>%
    head(num)
}
num <- 20
top_10_full_data <- GetTopNMostCommonWords(words_tokenized, num)

Now I will plot the top 20 most common words in the dataset.

# `num` is 20, so the plot shows the top 20 words; the title said 10.
ggplot(top_10_full_data, aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 20 Words from the Full Dataset", x = "Word", y = "Frequency")

Now let’s see how this varies among label: stressed or non-stressed.

# In the Dreaddit dataset label 1 marks a stressed post and label 0 a
# non-stressed post; the original two filters were swapped.
stressed_data <- filter(words_tokenized, label == 1)
non_stressed_data <- filter(words_tokenized, label == 0)

Now let’s plot them

# `num` is 20, so the title should say 20 (was "Top 10").
ggplot(GetTopNMostCommonWords(non_stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 20 Words from the Non-Stressed Dataset", x = "Word", y = "Frequency")

Now let’s see the difference among stressed data.

# `num` is 20, so the title should say 20 (was "Top 10").
ggplot(GetTopNMostCommonWords(stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 20 Words from the Stressed Dataset", x = "Word", y = "Frequency")

# Exploring Differences in Subreddit Data

Now we’re going to examine the differences by subreddit. First, we will see what unique subreddits have been selected. For each subreddit, I want to examine the difference between the labels and the different words among each label.

# List the distinct subreddits represented in the combined dataset.
unique(reddit_stress_data$subreddit)
##  [1] "ptsd"             "assistance"       "relationships"    "survivorsofabuse"
##  [5] "domesticviolence" "anxiety"          "homeless"         "stress"          
##  [9] "almosthomeless"   "food_pantry"

I’m interested in understanding how the data is distributed among each of these subreddits.

# Bar chart of post counts per subreddit.
reddit_stress_data %>%
  ggplot(aes(x = subreddit)) +
  geom_bar(fill = "steelblue")

PTSD Subreddit

Now, let’s check out the ptsd subreddit.

# PTSD subreddit: label distribution and its most common words.
ptsd_data <- filter(words_tokenized, subreddit == "ptsd")
ggplot(filter(reddit_stress_data, subreddit == "ptsd"), aes(x = label)) + geom_bar(fill = "steelblue") + labs(title = "PTSD Data by Label")

# `num` is 20, so the title should say 20 (was "Top 10").
ggplot(GetTopNMostCommonWords(ptsd_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 20 Words from the PTSD Subreddit", x = "Word", y = "Frequency")

## Domestic Violence Subreddit

Now, let’s check out the domestic violence subreddit.

# Domestic violence subreddit: label distribution and its most common words.
domestic_violence_data <- filter(words_tokenized, subreddit == "domesticviolence")
ggplot(filter(reddit_stress_data, subreddit == "domesticviolence"), aes(x = label)) + geom_bar(fill = "steelblue") + labs(title = "Domestic Violence Data by Label")

# `num` is 20, so the title should say 20 (was "Top 10").
ggplot(GetTopNMostCommonWords(domestic_violence_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 20 Words from the Domestic Violence Subreddit", x = "Word", y = "Frequency")

## Almost Homeless Subreddit

Now let’s check out the almost homeless subreddit.

# Almost-homeless subreddit: label distribution and its most common words.
almost_homeless_data <- filter(words_tokenized, subreddit == "almosthomeless")
ggplot(filter(reddit_stress_data, subreddit == "almosthomeless"), aes(x = label)) + geom_bar(fill = "steelblue") + labs(title = "Almost Homeless Data by Label")

# `num` is 20, so the title should say 20 (was "Top 10").
ggplot(GetTopNMostCommonWords(almost_homeless_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 20 Words from the Almost Homeless Subreddit", x = "Word", y = "Frequency")

## Assistance Subreddit

# Assistance subreddit: label distribution and its most common words.
assistance_subreddit <- words_tokenized %>% filter(subreddit == "assistance")
reddit_stress_data %>%
  filter(subreddit == "assistance") %>%
  ggplot(aes(x = label)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Assistance Data by Label")

assistance_subreddit %>%
  GetTopNMostCommonWords(num) %>%
  ggplot(aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Assistance Subreddit", x = "Word", y = "Frequency")

Part 5: Visualizing the Distribution of Sentiment

Overall

# `bins` is a histogram parameter and is ignored by geom_boxplot() (it
# triggered the "Ignoring unknown parameters" warning), so drop it.
ggplot(reddit_stress_data, aes(x = sentiment)) + geom_boxplot(fill = "steelblue") + labs(title = "Distribution of Sentiment")

## By Label

# `bins` is invalid for geom_boxplot() (see warning in output); drop it.
ggplot(reddit_stress_data, aes(x = sentiment)) + geom_boxplot(fill = "steelblue") + labs(title = "Distribution of Sentiment") + facet_wrap(~ label)

## By Subreddit

# `bins` is invalid for geom_boxplot() (see warning in output); drop it.
ggplot(reddit_stress_data, aes(x = sentiment)) + geom_boxplot(fill = "steelblue") + labs(title = "Distribution of Sentiment") + facet_wrap(~ subreddit)

By Label and Subreddit

# `bins` is invalid for geom_boxplot() (see warning in output); drop it.
ggplot(reddit_stress_data, aes(x = sentiment)) + geom_boxplot(fill = "steelblue") + labs(title = "Distribution of Sentiment") + facet_grid(subreddit ~ label)